Data analysis with non-hierarchical clustering (algorithm k-means) on data about line distribution in movies. The analysis was made on the dataset Polygraph’s Film Dialogue. Information about the dataset and how it was generated can be found in its original repository.
readr::read_csv(here("data/character_list5.csv"),
progress = FALSE,
col_types = cols(
script_id = col_integer(),
imdb_character_name = col_character(),
words = col_integer(),
gender = col_character(),
age = col_character()
)) %>%
mutate(age = as.numeric(age)) -> characters_list
readr::read_csv(here("data/meta_data7.csv"),
progress = FALSE,
col_types = cols(
script_id = col_integer(),
imdb_id = col_character(),
title = col_character(),
year = col_integer(),
gross = col_integer(),
lines_data = col_character()
)) %>%
mutate(title = iconv(title,"latin1", "UTF-8")) -> meta_data
left_join(characters_list,
meta_data,
by=c("script_id")) %>%
group_by(title, year) %>%
drop_na(gross) %>%
ungroup() -> scripts_data
scripts_data %>%
glimpse()
Observations: 19,387
Variables: 10
$ script_id <int> 280, 280, 280, 280, 280, 280, 280, 623, 623, 623, 623, 623, 623, 623, 623, 623, 623, 623, 623, 623, 623...
$ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "francesca johns", "madge", "michael johnson", "robert kincaid",...
$ words <int> 311, 873, 138, 2251, 190, 723, 1908, 328, 409, 347, 2020, 366, 160, 1337, 1683, 148, 801, 608, 596, 166...
$ gender <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f", "m", "m", "m", "m", "m", "m", "f", "f", "m", "m", "f", "m"...
$ age <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58, 53, 25, 39, 33, NA, 34, 34, 46, 26, 25, 42, 47, 32, 31, 46,...
$ imdb_id <chr> "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt0179626",...
$ title <chr> "The Bridges of Madison County", "The Bridges of Madison County", "The Bridges of Madison County", "The...
$ year <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2001, 2...
$ gross <int> 142, 142, 142, 142, 142, 142, 142, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 376, 376, 37...
$ lines_data <chr> "4332023434343443203433434334433434343434434344344333434443444344233434273", "4332023434343443203433434...
scripts_data %>%
mutate(fem_words = ifelse(gender == "f",words,0),
man_words = ifelse(gender == "m",words,0)) %>%
group_by(title, year) %>%
mutate(total_fem_words = sum(fem_words),
total_man_words = sum(man_words)) %>%
filter(total_fem_words != 0) %>%
filter(total_man_words != 0) %>%
mutate(f_m_ratio = sum(gender == "f")/sum(gender == "m"),
f_m_wordratio = total_fem_words/total_man_words) %>%
ungroup() -> scripts_data
scripts_data %>%
select(title,
year,
f_m_ratio,
f_m_wordratio) %>%
sample_n(10)
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=f_m_wordratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
boundary = 0,
fill = "grey",
color = "black") +
labs(y="Relative Frequency",
x="female/male wordratio")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
filter(f_m_wordratio < 10) %>%
ggplot(aes(x=f_m_wordratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
fill = "grey",
color = "black") +
labs(y="Relative Frequency",
x="female/male wordratio")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=f_m_wordratio)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="female/male wordratio") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=f_m_ratio,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 0.1,
boundary = 0,
fill = "grey",
color = "black") +
scale_x_continuous(breaks = seq(0,10,0.5)) +
labs(y="Relative Frequency",
x="(female chars / male chars) ratio")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=f_m_ratio)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="(female chars / male chars) ratio") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=year)) +
geom_bar(fill = "grey",
color = "black") +
labs(y="Absolute Frequency",
x="Year of release")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=year)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="Year of release") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x=gross,
y=(..count..)/sum(..count..))) +
geom_histogram(binwidth = 50,
boundary = 0,
fill = "grey",
color = "black") +
labs(y="Relative Frequency", x="Gross")
scripts_data %>%
group_by(title,year) %>%
slice(1) %>%
unique() %>%
ggplot(aes(x="",
y=gross)) +
geom_violin(fill="grey",
width=0.5) +
labs(y="Gross") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
scripts_data %>%
group_by(title) %>%
slice(1) %>%
unique() %>%
ungroup() %>%
select(title,
gross,
f_m_ratio,
f_m_wordratio) -> data
select(data, -title) %>%
mutate_all(funs(scale)) -> scaled_data
scaled_data %>%
sample_n(10)
The GAP Statistic compares the grouping results with each available k in a dataset where there isn’t grouping structure.
plot_clusgap = function(clusgap, title="Gap Statistic calculation results"){
require("ggplot2")
gstab = data.frame(clusgap$Tab, k=1:nrow(clusgap$Tab))
p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size=5)
p = p + geom_errorbar(aes(ymax=gap+SE.sim, ymin=gap-SE.sim), width = .2)
p = p + ggtitle(title)
return(p)
}
gaps <- scaled_data %>%
clusGap(FUN = kmeans,
nstart = 20,
K.max = 8,
B = 200,
iter.max=30)
Clustering k = 1,2,..., K.max (= 8): .. done
Bootstrapping, b = 1,2,..., B (= 200) [one "." per sample]:
.................................................. 50
.................................................. 100
.................................................. 150
.................................................. 200
plot_clusgap(gaps)
# Compute and plot wss for k = 2 to k = 15.
k.max <- 15
wss <- sapply(1:k.max,
function(k){kmeans(scaled_data, k, nstart=50,iter.max = 15 )$tot.withinss})
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
fitting ...
|
| | 0%
|
|= | 0%
|
|= | 1%
|
|== | 1%
|
|== | 2%
|
|=== | 2%
|
|==== | 3%
|
|===== | 4%
|
|====== | 5%
|
|======= | 5%
|
|======= | 6%
|
|======== | 6%
|
|======== | 7%
|
|========= | 7%
|
|========== | 8%
|
|=========== | 9%
|
|============ | 9%
|
|============= | 10%
|
|============== | 11%
|
|=============== | 12%
|
|================ | 12%
|
|================ | 13%
|
|================= | 13%
|
|================= | 14%
|
|================== | 14%
|
|=================== | 15%
|
|==================== | 16%
|
|===================== | 17%
|
|====================== | 18%
|
|======================= | 18%
|
|======================== | 19%
|
|========================= | 20%
|
|========================== | 20%
|
|========================== | 21%
|
|=========================== | 21%
|
|=========================== | 22%
|
|============================ | 22%
|
|============================= | 23%
|
|============================== | 24%
|
|=============================== | 25%
|
|================================ | 25%
|
|================================ | 26%
|
|================================= | 26%
|
|================================= | 27%
|
|================================== | 27%
|
|=================================== | 27%
|
|=================================== | 28%
|
|==================================== | 28%
|
|==================================== | 29%
|
|===================================== | 29%
|
|====================================== | 30%
|
|======================================= | 31%
|
|======================================== | 32%
|
|========================================= | 32%
|
|========================================= | 33%
|
|========================================== | 33%
|
|========================================== | 34%
|
|=========================================== | 34%
|
|============================================ | 35%
|
|============================================= | 36%
|
|============================================== | 36%
|
|=============================================== | 37%
|
|================================================ | 38%
|
|================================================= | 39%
|
|================================================== | 39%
|
|================================================== | 40%
|
|=================================================== | 40%
|
|=================================================== | 41%
|
|==================================================== | 41%
|
|===================================================== | 42%
|
|====================================================== | 43%
|
|======================================================= | 44%
|
|======================================================== | 44%
|
|======================================================== | 45%
|
|========================================================= | 45%
|
|========================================================== | 46%
|
|=========================================================== | 46%
|
|=========================================================== | 47%
|
|============================================================ | 47%
|
|============================================================ | 48%
|
|============================================================= | 48%
|
|============================================================== | 49%
|
|=============================================================== | 50%
|
|================================================================ | 51%
|
|================================================================= | 52%
|
|================================================================== | 52%
|
|================================================================== | 53%
|
|=================================================================== | 53%
|
|=================================================================== | 54%
|
|==================================================================== | 54%
|
|===================================================================== | 55%
|
|====================================================================== | 55%
|
|====================================================================== | 56%
|
|======================================================================= | 56%
|
|======================================================================== | 57%
|
|========================================================================= | 58%
|
|========================================================================== | 59%
|
|=========================================================================== | 59%
|
|=========================================================================== | 60%
|
|============================================================================ | 60%
|
|============================================================================ | 61%
|
|============================================================================= | 61%
|
|============================================================================== | 62%
|
|=============================================================================== | 63%
|
|================================================================================ | 64%
|
|================================================================================= | 64%
|
|================================================================================== | 65%
|
|=================================================================================== | 66%
|
|==================================================================================== | 66%
|
|==================================================================================== | 67%
|
|===================================================================================== | 67%
|
|===================================================================================== | 68%
|
|====================================================================================== | 68%
|
|======================================================================================= | 69%
|
|======================================================================================== | 70%
|
|========================================================================================= | 71%
|
|========================================================================================== | 71%
|
|========================================================================================== | 72%
|
|=========================================================================================== | 72%
|
|=========================================================================================== | 73%
|
|============================================================================================ | 73%
|
|============================================================================================= | 73%
|
|============================================================================================= | 74%
|
|============================================================================================== | 74%
|
|============================================================================================== | 75%
|
|=============================================================================================== | 75%
|
|================================================================================================ | 76%
|
|================================================================================================= | 77%
|
|================================================================================================== | 78%
|
|=================================================================================================== | 78%
|
|=================================================================================================== | 79%
|
|==================================================================================================== | 79%
|
|==================================================================================================== | 80%
|
|===================================================================================================== | 80%
|
|====================================================================================================== | 81%
|
|======================================================================================================= | 82%
|
|======================================================================================================== | 82%
|
|========================================================================================================= | 83%
|
|========================================================================================================== | 84%
|
|=========================================================================================================== | 85%
|
|============================================================================================================ | 86%
|
|============================================================================================================= | 86%
|
|============================================================================================================= | 87%
|
|============================================================================================================== | 87%
|
|============================================================================================================== | 88%
|
|=============================================================================================================== | 88%
|
|================================================================================================================ | 89%
|
|================================================================================================================= | 90%
|
|================================================================================================================== | 91%
|
|=================================================================================================================== | 91%
|
|==================================================================================================================== | 92%
|
|===================================================================================================================== | 93%
|
|====================================================================================================================== | 93%
|
|====================================================================================================================== | 94%
|
|======================================================================================================================= | 94%
|
|======================================================================================================================= | 95%
|
|======================================================================================================================== | 95%
|
|========================================================================================================================= | 96%
|
|========================================================================================================================== | 97%
|
|=========================================================================================================================== | 98%
|
|============================================================================================================================ | 98%
|
|============================================================================================================================ | 99%
|
|============================================================================================================================= | 99%
|
|============================================================================================================================= | 100%
|
|==============================================================================================================================| 100%
set.seed(23)
nb <- NbClust(scaled_data, diss=NULL, distance = "euclidean",
min.nc=2, max.nc=5, method = "kmeans",
index = "all", alphaBeale = 0.1)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 3 proposed 2 as the best number of clusters
* 7 proposed 3 as the best number of clusters
* 11 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 4
*******************************************************************
hist(nb$Best.nc[1,], breaks = max(na.omit(nb$Best.nc[1,])))
set.seed(23)
n_clusters = 3
scaled_data %>%
kmeans(n_clusters, iter.max = 100, nstart = 20) -> km
p <- autoplot(km, data=scaled_data, frame = TRUE)
ggplotly(p)
set.seed(23)
row.names(scaled_data) <- data$title
toclust <- scaled_data %>%
rownames_to_column(var = "title")
km = toclust %>%
select(-title) %>%
kmeans(centers = n_clusters, iter.max = 100, nstart = 20)
km %>%
augment(toclust) %>%
gather(key = "variável", value = "valor", -title, -.cluster) %>%
ggplot(aes(x = `variável`, y = valor, group = title, colour = .cluster)) +
geom_point(alpha = 0.2) +
geom_line(alpha = .5) +
facet_wrap(~ .cluster) +
coord_flip()
\(\color{red}{\text{Grupo 1}}\) - We Can Do It!
O \(\color{red}{\text{Grupo 1}}\) - We Can Do It! é o grupo de filmes de maior representação feminina, quer seja em proporção de personagens femininos como em proporção de dialógos dedicados a personagens femininos. Existe porém uma característica negativa que acompanha este mesmo grupo, pois este é também o grupo das menores taxas de faturamento. Isso sugere uma infeliz associação negativa entre a representação feminina em filmes e o faturamento destes.
O nome do grupo se refere ao famoso cartaz de J. Howard Miller de 1943 incentivado as mulheres a participar no esforço de guerra nas fábricas.
\(\color{green}{\text{Grupo 2}}\) - It’s A Man’s Man’s Man’s World
O \(\color{green}{\text{Grupo 2}}\) - It’s A Man’s Man’s Man’s World é o grupo de filmes de menor representação feminina, quer seja em proporção de personagens femininos como em proporção de dialógos dedicados a personagens femininos. Existe porém uma característica negativa que acompanha este mesmo grupo, pois este é também o grupo de maiores taxas de faturamento. Isso sugere uma infeliz associação positiva entre ausência de representação feminina em filmes e o faturamento destes.
O nome do grupo se refere à música de James Brown, a qual foi escrita por sua então namorada Betty Jean Newsome como um comentário sobre a relação entre os sexos.
\(\color{blue}{\text{Grupo 3}}\) - Em cima do muro
O nome do grupo se refere à expressão que significa não tomar partido.
set.seed(23)
dists = scaled_data %>%
dist()
scaled_data %>%
kmeans(3, iter.max = 100, nstart = 20) -> km
silhouette(km$cluster, dists) %>%
plot(col = RColorBrewer::brewer.pal(4, "Set2"),border=NA)